#define SCRATCH_PFN 0xFFFFF
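+/*
+ * Special-page layout, lowest PFN first: guard page, buffered ioreq page,
+ * xenstore page, ioreq page, identity-map page table.
+ */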
+#define SPECIALPAGE_GUARD 0
+#define SPECIALPAGE_BUFIOREQ 1
+#define SPECIALPAGE_XENSTORE 2
+#define SPECIALPAGE_IOREQ 3
+#define SPECIALPAGE_IDENT_PT 4
+#define NR_SPECIAL_PAGES 5
+
static void build_e820map(void *e820_page, unsigned long long mem_size)
{
struct e820entry *e820entry =
e820entry[nr_map].type = E820_RESERVED;
nr_map++;
- /*
- * Low RAM goes here. Remove 4 pages for: ioreq, bufioreq, and xenstore.
- * 1. Guard page.
- * 2. Buffered ioreq.
- * 3. Xenstore.
- * 4. Normal ioreq.
- */
+ /* Low RAM goes here. Reserve space for special pages. */
e820entry[nr_map].addr = 0x100000;
- e820entry[nr_map].size = mem_size - 0x100000 - PAGE_SIZE * 4;
+ e820entry[nr_map].size = (mem_size - 0x100000 -
+ PAGE_SIZE * NR_SPECIAL_PAGES);
e820entry[nr_map].type = E820_RAM;
nr_map++;
- /* Explicitly reserve space for special pages. */
- e820entry[nr_map].addr = mem_size - PAGE_SIZE * 3;
- e820entry[nr_map].size = PAGE_SIZE * 3;
+ /* Explicitly reserve space for special pages (excluding guard page). */
+ e820entry[nr_map].addr = mem_size - PAGE_SIZE * (NR_SPECIAL_PAGES - 1);
+ e820entry[nr_map].size = PAGE_SIZE * (NR_SPECIAL_PAGES - 1);
e820entry[nr_map].type = E820_RESERVED;
nr_map++;
{
xen_pfn_t *page_array = NULL;
unsigned long i, nr_pages = (unsigned long)memsize << (20 - PAGE_SHIFT);
- unsigned long shared_page_nr, entry_eip;
+ unsigned long special_page_nr, entry_eip;
struct xen_add_to_physmap xatp;
struct shared_info *shared_info;
void *e820_page;
+ uint32_t *ident_pt;
struct elf_binary elf;
uint64_t v_start, v_end;
int rc;
sizeof(shared_info->evtchn_mask));
munmap(shared_info, PAGE_SIZE);
- if ( v_end > HVM_BELOW_4G_RAM_END )
- shared_page_nr = (HVM_BELOW_4G_RAM_END >> PAGE_SHIFT) - 1;
- else
- shared_page_nr = (v_end >> PAGE_SHIFT) - 1;
+ special_page_nr = (((v_end > HVM_BELOW_4G_RAM_END)
+ ? (HVM_BELOW_4G_RAM_END >> PAGE_SHIFT)
+ : (v_end >> PAGE_SHIFT))
+ - NR_SPECIAL_PAGES);
+
+ /* Paranoia: clean special pages. */
+ for ( i = 0; i < NR_SPECIAL_PAGES; i++ )
+ if ( xc_clear_domain_page(xc_handle, dom, special_page_nr + i) )
+ goto error_out;
/* Free the guard page that separates low RAM from special pages. */
rc = xc_domain_memory_decrease_reservation(
- xc_handle, dom, 1, 0, &page_array[shared_page_nr-3]);
+ xc_handle, dom, 1, 0, &page_array[special_page_nr]);
if ( rc != 0 )
{
PERROR("Could not deallocate guard page for HVM guest.\n");
goto error_out;
}
- /* Paranoia: clean pages. */
- if ( xc_clear_domain_page(xc_handle, dom, shared_page_nr) ||
- xc_clear_domain_page(xc_handle, dom, shared_page_nr-1) ||
- xc_clear_domain_page(xc_handle, dom, shared_page_nr-2) )
- goto error_out;
+ xc_set_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN,
+ special_page_nr + SPECIALPAGE_XENSTORE);
+ xc_set_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN,
+ special_page_nr + SPECIALPAGE_BUFIOREQ);
+ xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN,
+ special_page_nr + SPECIALPAGE_IOREQ);
- xc_set_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN, shared_page_nr-1);
- xc_set_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN, shared_page_nr-2);
- xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN, shared_page_nr);
+ /*
+ * Identity-map page table is required for running with CR0.PG=0 when
+ * using Intel EPT. Create a 32-bit non-PAE page directory of superpages.
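+ * Each of the 1024 PDEs maps a 4MB superpage, so the directory
+ * identity-maps the first 4GB of guest physical address space.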
+ */
+ if ( (ident_pt = xc_map_foreign_range(
+ xc_handle, dom, PAGE_SIZE, PROT_READ | PROT_WRITE,
+ special_page_nr + SPECIALPAGE_IDENT_PT)) == NULL )
+ goto error_out;
+ for ( i = 0; i < PAGE_SIZE / sizeof(*ident_pt); i++ )
+ ident_pt[i] = ((i << 22) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
+ _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
+ munmap(ident_pt, PAGE_SIZE);
+ xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT,
+ special_page_nr + SPECIALPAGE_IDENT_PT);
/* Insert JMP <rel32> instruction at address 0x0 to reach entry point. */
entry_eip = elf_uval(&elf, elf.ehdr, e_entry);
HYPERVISOR_COMPAT_VIRT_START(d) = __HYPERVISOR_COMPAT_VIRT_START;
#endif
- paging_domain_init(d);
+ if ( (rc = paging_domain_init(d)) != 0 )
+ goto fail;
paging_initialised = 1;
if ( !is_idle_domain(d) )
{
d->arch.ioport_caps =
rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
+ rc = -ENOMEM;
if ( d->arch.ioport_caps == NULL )
goto fail;
if ( a.value > HVMPTM_one_missed_tick_pending )
goto param_fail;
break;
+ case HVM_PARAM_IDENT_PT:
+ rc = -EPERM;
+ if ( current->domain->domain_id != 0 )
+ goto param_fail;
+
+ rc = -EINVAL;
+ if ( d->arch.hvm_domain.params[a.index] != 0 )
+ goto param_fail;
+
+ if ( !paging_mode_hap(d) )
+ break;
+
+ domain_pause(d);
+
+ /*
+ * Update GUEST_CR3 in each VMCS to point at identity map.
+ * All foreign updates to guest state must synchronise on
+ * the domctl_lock.
+ */
+ spin_lock(&domctl_lock);
+ d->arch.hvm_domain.params[a.index] = a.value;
+ for_each_vcpu ( d, v )
+ paging_update_cr3(v);
+ spin_unlock(&domctl_lock);
+
+ domain_unpause(d);
+ break;
}
d->arch.hvm_domain.params[a.index] = a.value;
rc = 0;
min = (CPU_BASED_HLT_EXITING |
CPU_BASED_INVLPG_EXITING |
+ CPU_BASED_CR3_LOAD_EXITING |
+ CPU_BASED_CR3_STORE_EXITING |
CPU_BASED_MONITOR_EXITING |
CPU_BASED_MWAIT_EXITING |
CPU_BASED_MOV_DR_EXITING |
CPU_BASED_ACTIVATE_IO_BITMAP |
CPU_BASED_USE_TSC_OFFSETING);
- opt = CPU_BASED_ACTIVATE_MSR_BITMAP;
- opt |= CPU_BASED_TPR_SHADOW;
- opt |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+ opt = (CPU_BASED_ACTIVATE_MSR_BITMAP |
+ CPU_BASED_TPR_SHADOW |
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS);
_vmx_cpu_based_exec_control = adjust_vmx_controls(
min, opt, MSR_IA32_VMX_PROCBASED_CTLS);
#ifdef __x86_64__
{
min = 0;
opt = (SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
- SECONDARY_EXEC_WBINVD_EXITING);
+ SECONDARY_EXEC_WBINVD_EXITING |
+ SECONDARY_EXEC_ENABLE_EPT);
_vmx_secondary_exec_control = adjust_vmx_controls(
min, opt, MSR_IA32_VMX_PROCBASED_CTLS2);
}
+ if ( _vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT )
+ {
+ /* To use EPT we expect to be able to clear certain intercepts. */
+ uint32_t must_be_one, must_be_zero;
+ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, must_be_one, must_be_zero);
+ if ( must_be_one & (CPU_BASED_INVLPG_EXITING |
+ CPU_BASED_CR3_LOAD_EXITING |
+ CPU_BASED_CR3_STORE_EXITING) )
+ _vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+ }
+
#if defined(__i386__)
/* If we can't virtualise APIC accesses, the TPR shadow is pointless. */
if ( !(_vmx_secondary_exec_control &
return 0;
}
+ ept_sync_all();
+
return 1;
}
static int construct_vmcs(struct vcpu *v)
{
+ struct domain *d = v->domain;
uint16_t sysenter_cs;
unsigned long sysenter_eip;
__vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control);
__vmwrite(VM_EXIT_CONTROLS, vmx_vmexit_control);
__vmwrite(VM_ENTRY_CONTROLS, vmx_vmentry_control);
- __vmwrite(CPU_BASED_VM_EXEC_CONTROL, vmx_cpu_based_exec_control);
+
v->arch.hvm_vmx.exec_control = vmx_cpu_based_exec_control;
- if ( vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS )
- __vmwrite(SECONDARY_VM_EXEC_CONTROL, vmx_secondary_exec_control);
+ v->arch.hvm_vmx.secondary_exec_control = vmx_secondary_exec_control;
+
+ if ( paging_mode_hap(d) )
+ {
+ v->arch.hvm_vmx.exec_control &= ~(CPU_BASED_INVLPG_EXITING |
+ CPU_BASED_CR3_LOAD_EXITING |
+ CPU_BASED_CR3_STORE_EXITING);
+ }
+ else
+ {
+ v->arch.hvm_vmx.secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+ }
+
+ __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
+ if ( cpu_has_vmx_secondary_exec_control )
+ __vmwrite(SECONDARY_VM_EXEC_CONTROL,
+ v->arch.hvm_vmx.secondary_exec_control);
/* MSR access bitmap. */
if ( cpu_has_vmx_msr_bitmap )
__vmwrite(VMCS_LINK_POINTER_HIGH, ~0UL);
#endif
- __vmwrite(EXCEPTION_BITMAP, (HVM_TRAP_MASK |
- (1U << TRAP_page_fault) |
- (1U << TRAP_no_device)));
+ __vmwrite(EXCEPTION_BITMAP,
+ HVM_TRAP_MASK
+ | (paging_mode_hap(d) ? 0 : (1U << TRAP_page_fault))
+ | (1U << TRAP_no_device));
v->arch.hvm_vcpu.guest_cr[0] = X86_CR0_PE | X86_CR0_ET;
hvm_update_guest_cr(v, 0);
__vmwrite(TPR_THRESHOLD, 0);
}
+ if ( paging_mode_hap(d) )
+ {
+ __vmwrite(EPT_POINTER, d->arch.hvm_domain.vmx.ept_control.eptp);
+#ifdef CONFIG_X86_PAE
+ __vmwrite(EPT_POINTER_HIGH,
+ d->arch.hvm_domain.vmx.ept_control.eptp >> 32);
+#endif
+ }
+
vmx_vmcs_exit(v);
paging_update_paging_modes(v); /* will update HOST & GUEST_CR3 as reqd */
(uint32_t)vmr(IDT_VECTORING_ERROR_CODE));
printk("TPR Threshold = 0x%02x\n",
(uint32_t)vmr(TPR_THRESHOLD));
+ printk("EPT pointer = 0x%08x%08x\n",
+ (uint32_t)vmr(EPT_POINTER_HIGH), (uint32_t)vmr(EPT_POINTER));
vmx_vmcs_exit(v);
}
static int vmx_domain_initialise(struct domain *d)
{
+ d->arch.hvm_domain.vmx.ept_control.etmt = EPT_DEFAULT_MT;
+ d->arch.hvm_domain.vmx.ept_control.gaw = EPT_DEFAULT_GAW;
+ d->arch.hvm_domain.vmx.ept_control.asr =
+ pagetable_get_pfn(d->arch.phys_table);
+
return vmx_alloc_vlapic_mapping(d);
}
static void vmx_domain_destroy(struct domain *d)
{
+ ept_sync_domain(d);
vmx_free_vlapic_mapping(d);
}
unsigned long mfn = 0;
p2m_type_t p2mt;
- if ( cr0 & X86_CR0_PG )
+ if ( paging_mode_shadow(v->domain) )
{
- mfn = mfn_x(gfn_to_mfn(v->domain, cr3 >> PAGE_SHIFT, &p2mt));
- if ( !p2m_is_ram(p2mt) || !get_page(mfn_to_page(mfn), v->domain) )
+ if ( cr0 & X86_CR0_PG )
{
- gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%lx\n", cr3);
- return -EINVAL;
+ mfn = mfn_x(gfn_to_mfn(v->domain, cr3 >> PAGE_SHIFT, &p2mt));
+ if ( !p2m_is_ram(p2mt) || !get_page(mfn_to_page(mfn), v->domain) )
+ {
+ gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%lx\n", cr3);
+ return -EINVAL;
+ }
}
- }
- if ( v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PG )
- put_page(pagetable_get_page(v->arch.guest_table));
+ if ( hvm_paging_enabled(v) )
+ put_page(pagetable_get_page(v->arch.guest_table));
- v->arch.guest_table = pagetable_from_pfn(mfn);
+ v->arch.guest_table = pagetable_from_pfn(mfn);
+ }
v->arch.hvm_vcpu.guest_cr[0] = cr0 | X86_CR0_ET;
v->arch.hvm_vcpu.guest_cr[3] = cr3;
__vmwrite(GUEST_INTERRUPTIBILITY_INFO, intr_shadow);
}
+static void vmx_load_pdptrs(struct vcpu *v)
+{
+ unsigned long cr3 = v->arch.hvm_vcpu.guest_cr[3], mfn;
+ uint64_t *guest_pdptrs;
+ p2m_type_t p2mt;
+ char *p;
+
+ /* With EPT, the PDPTRs must be loaded into the VMCS for PAE guests. */
+ if ( !hvm_pae_enabled(v) || (v->arch.hvm_vcpu.guest_efer & EFER_LMA) )
+ return;
+
+ if ( cr3 & 0x1fUL )
+ goto crash;
+
+ mfn = mfn_x(gfn_to_mfn(v->domain, cr3 >> PAGE_SHIFT, &p2mt));
+ if ( !p2m_is_ram(p2mt) )
+ goto crash;
+
+ p = map_domain_page(mfn);
+
+ guest_pdptrs = (uint64_t *)(p + (cr3 & ~PAGE_MASK));
+
+ /*
+ * We do not check the PDPTRs for validity. The CPU will do this during
+ * vm entry, and we can handle the failure there and crash the guest.
+ * The only thing we could do better here is #GP instead.
+ */
+
+ vmx_vmcs_enter(v);
+
+ __vmwrite(GUEST_PDPTR0, guest_pdptrs[0]);
+ __vmwrite(GUEST_PDPTR1, guest_pdptrs[1]);
+ __vmwrite(GUEST_PDPTR2, guest_pdptrs[2]);
+ __vmwrite(GUEST_PDPTR3, guest_pdptrs[3]);
+#ifdef CONFIG_X86_PAE
+ __vmwrite(GUEST_PDPTR0_HIGH, guest_pdptrs[0] >> 32);
+ __vmwrite(GUEST_PDPTR1_HIGH, guest_pdptrs[1] >> 32);
+ __vmwrite(GUEST_PDPTR2_HIGH, guest_pdptrs[2] >> 32);
+ __vmwrite(GUEST_PDPTR3_HIGH, guest_pdptrs[3] >> 32);
+#endif
+
+ vmx_vmcs_exit(v);
+
+ unmap_domain_page(p);
+ return;
+
+ crash:
+ domain_crash(v->domain);
+}
+
static void vmx_update_host_cr3(struct vcpu *v)
{
vmx_vmcs_enter(v);
{
case 0: {
unsigned long hw_cr0_mask =
- X86_CR0_NE | X86_CR0_PG | X86_CR0_WP | X86_CR0_PE;
+ X86_CR0_NE | X86_CR0_PG | X86_CR0_PE;
+
+ if ( paging_mode_shadow(v->domain) )
+ hw_cr0_mask |= X86_CR0_WP;
+
+ if ( paging_mode_hap(v->domain) )
+ {
+ /* We manage GUEST_CR3 while the guest has paging (CR0.PG) disabled. */
+ uint32_t cr3_ctls = (CPU_BASED_CR3_LOAD_EXITING |
+ CPU_BASED_CR3_STORE_EXITING);
+ v->arch.hvm_vmx.exec_control &= ~cr3_ctls;
+ if ( !hvm_paging_enabled(v) )
+ v->arch.hvm_vmx.exec_control |= cr3_ctls;
+ __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
+
+ /* Changing CR0.PE can change some bits in real CR4. */
+ vmx_update_guest_cr(v, 4);
+ }
if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
{
/* CR2 is updated in exit stub. */
break;
case 3:
+ if ( paging_mode_hap(v->domain) )
+ {
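+ /*
+ * While the guest has paging disabled, point hardware CR3 at the
+ * identity-map table set up by the toolstack (HVM_PARAM_IDENT_PT).
+ */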
+ if ( !hvm_paging_enabled(v) )
+ v->arch.hvm_vcpu.hw_cr[3] =
+ v->domain->arch.hvm_domain.params[HVM_PARAM_IDENT_PT];
+ vmx_load_pdptrs(v);
+ }
+
__vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr[3]);
break;
case 4:
- v->arch.hvm_vcpu.hw_cr[4] =
- v->arch.hvm_vcpu.guest_cr[4] | HVM_CR4_HOST_MASK;
+ v->arch.hvm_vcpu.hw_cr[4] = HVM_CR4_HOST_MASK;
+ if ( paging_mode_hap(v->domain) )
+ v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_PAE;
+ v->arch.hvm_vcpu.hw_cr[4] |= v->arch.hvm_vcpu.guest_cr[4];
+ if ( paging_mode_hap(v->domain) && !hvm_paging_enabled(v) )
+ {
+ v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_PSE;
+ v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_PAE;
+ }
__vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
__vmwrite(CR4_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[4]);
break;
* because VMRESUME will flush it for us. */
}
+static void __ept_sync_domain(void *info)
+{
+ struct domain *d = info;
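+ /* INVEPT type 1: single-context invalidation for this domain's EPTP. */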
+ __invept(1, d->arch.hvm_domain.vmx.ept_control.eptp, 0);
+}
+void ept_sync_domain(struct domain *d)
+{
+ /* Only if using EPT and this domain has VCPUs that may hold stale mappings. */
+ if ( d->arch.hvm_domain.hap_enabled && d->vcpu[0] )
+ on_each_cpu(__ept_sync_domain, d, 1, 1);
+}
static void __vmx_inject_exception(
struct vcpu *v, int trap, int type, int error_code)
return;
}
+ if ( cpu_has_vmx_ept )
+ {
+ printk("VMX: EPT is available.\n");
+ vmx_function_table.hap_supported = 1;
+ }
+
setup_vmcs_dump();
hvm_enable(&vmx_function_table);
share_xen_page_with_guest(virt_to_page(apic_va), d, XENSHARE_writable);
set_mmio_p2m_entry(
d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), _mfn(virt_to_mfn(apic_va)));
- d->arch.hvm_domain.vmx_apic_access_mfn = virt_to_mfn(apic_va);
+ d->arch.hvm_domain.vmx.apic_access_mfn = virt_to_mfn(apic_va);
return 0;
}
static void vmx_free_vlapic_mapping(struct domain *d)
{
- unsigned long mfn = d->arch.hvm_domain.vmx_apic_access_mfn;
+ unsigned long mfn = d->arch.hvm_domain.vmx.apic_access_mfn;
if ( mfn != 0 )
free_xenheap_page(mfn_to_virt(mfn));
}
return;
virt_page_ma = page_to_maddr(vcpu_vlapic(v)->regs_page);
- apic_page_ma = v->domain->arch.hvm_domain.vmx_apic_access_mfn;
+ apic_page_ma = v->domain->arch.hvm_domain.vmx.apic_access_mfn;
apic_page_ma <<= PAGE_SHIFT;
vmx_vmcs_enter(v);
wbinvd();
}
+static void ept_handle_violation(unsigned long qualification, paddr_t gpa)
+{
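+ /*
+ * Require qualification bits 7:8 to be set: the guest linear address is
+ * valid and the access was to the final translation rather than to a
+ * guest paging structure. Anything else is unexpected here, so crash the
+ * guest; otherwise treat the fault as emulated MMIO.
+ */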
+ if ( unlikely(((qualification >> 7) & 0x3) != 0x3) )
+ {
+ domain_crash(current->domain);
+ return;
+ }
+
+ handle_mmio();
+}
+
static void vmx_failed_vmentry(unsigned int exit_reason,
struct cpu_user_regs *regs)
{
unsigned long exit_qualification, inst_len = 0;
struct vcpu *v = current;
+ if ( paging_mode_hap(v->domain) && hvm_paging_enabled(v) )
+ v->arch.hvm_vcpu.guest_cr[3] = v->arch.hvm_vcpu.hw_cr[3] =
+ __vmread(GUEST_CR3);
+
exit_reason = __vmread(VM_EXIT_REASON);
hvmtrace_vmexit(v, regs->eip, exit_reason);
break;
}
+ case EXIT_REASON_EPT_VIOLATION:
+ {
+ paddr_t gpa = __vmread(GUEST_PHYSICAL_ADDRESS);
+#ifdef CONFIG_X86_PAE
+ gpa |= (paddr_t)__vmread(GUEST_PHYSICAL_ADDRESS_HIGH) << 32;
+#endif
+ exit_qualification = __vmread(EXIT_QUALIFICATION);
+ ept_handle_violation(exit_qualification, gpa);
+ break;
+ }
+
default:
exit_and_crash:
gdprintk(XENLOG_ERR, "Bad vmexit (reason %x)\n", exit_reason);
unsigned long domain_get_maximum_gpfn(struct domain *d)
{
if ( is_hvm_domain(d) )
- return d->arch.p2m.max_mapped_pfn;
+ return d->arch.p2m->max_mapped_pfn;
/* NB. PV guests specify nr_pfns rather than max_pfn so we adjust here. */
return arch_get_max_pfn(d) - 1;
}
obj-y += guest_walk_2level.o
obj-y += guest_walk_3level.o
obj-y += guest_walk_4level.o
+obj-y += p2m-ept.o
guest_levels = $(subst level,,$(filter %level,$(subst ., ,$(subst _, ,$(1)))))
guest_walk_defns = -DGUEST_PAGING_LEVELS=$(call guest_levels,$(1))
--- /dev/null
+/*
+ * p2m-ept.c: use the EPT page table as the p2m
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+#include <xen/config.h>
+#include <xen/domain_page.h>
+#include <xen/sched.h>
+#include <asm/current.h>
+#include <asm/types.h>
+#include <asm/domain.h>
+#include <asm/p2m.h>
+#include <asm/hvm/vmx/vmx.h>
+#include <xen/iommu.h>
+
+static int ept_next_level(struct domain *d, bool_t read_only,
+ ept_entry_t **table, unsigned long *gfn_remainder,
+ u32 shift)
+{
+ ept_entry_t *ept_entry, *next;
+ u32 index;
+
+ index = *gfn_remainder >> shift;
+ *gfn_remainder &= (1UL << shift) - 1;
+
+ ept_entry = (*table) + index;
+
+ if ( !(ept_entry->epte & 0x7) )
+ {
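+ /* r/w/x all clear: entry not yet present, so allocate a new table page. */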
+ struct page_info *pg;
+
+ if ( read_only )
+ return 0;
+
+ pg = d->arch.p2m->alloc_page(d);
+ if ( pg == NULL )
+ return 0;
+
+ pg->count_info = 1;
+ pg->u.inuse.type_info = 1 | PGT_validated;
+ list_add_tail(&pg->list, &d->arch.p2m->pages);
+
+ ept_entry->emt = 0;
+ ept_entry->sp_avail = 0;
+ ept_entry->avail1 = 0;
+ ept_entry->mfn = page_to_mfn(pg);
+ ept_entry->rsvd = 0;
+ ept_entry->avail2 = 0;
+ /* Finally set r/w/x, making the entry present. */
+ ept_entry->r = ept_entry->w = ept_entry->x = 1;
+ }
+
+ next = map_domain_page(ept_entry->mfn);
+ unmap_domain_page(*table);
+ *table = next;
+
+ return 1;
+}
+
+static int
+ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt)
+{
+ ept_entry_t *table =
+ map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
+ unsigned long gfn_remainder = gfn;
+ ept_entry_t *ept_entry = NULL;
+ u32 index;
+ int i, rv = 0;
+
+ /* Should check if gfn obeys GAW here. */
+
+ for ( i = EPT_DEFAULT_GAW; i > 0; i-- )
+ if ( !ept_next_level(d, 0, &table, &gfn_remainder,
+ i * EPT_TABLE_ORDER) )
+ goto out;
+
+ index = gfn_remainder;
+ ept_entry = table + index;
+
+ if ( mfn_valid(mfn_x(mfn)) || (p2mt == p2m_mmio_direct) )
+ {
+ /* Track the highest gfn for which we have ever had a valid mapping */
+ if ( gfn > d->arch.p2m->max_mapped_pfn )
+ d->arch.p2m->max_mapped_pfn = gfn;
+
+ ept_entry->emt = EPT_DEFAULT_MT;
+ ept_entry->sp_avail = 0;
+ ept_entry->avail1 = p2mt;
+ ept_entry->mfn = mfn_x(mfn);
+ ept_entry->rsvd = 0;
+ ept_entry->avail2 = 0;
+ /* Finally set r/w/x, making the entry present. */
+ ept_entry->r = ept_entry->w = ept_entry->x = 1;
+ }
+ else
+ ept_entry->epte = 0;
+
+ /* Success */
+ rv = 1;
+
+ out:
+ unmap_domain_page(table);
+
+ ept_sync_domain(d);
+
+ /* If the p2m table is shared with the VT-d page tables, flush the IOMMU too. */
+ if ( iommu_enabled && is_hvm_domain(d) && (p2mt == p2m_mmio_direct) )
+ iommu_flush(d, gfn, (u64*)ept_entry);
+
+ return rv;
+}
+
+/* Read ept p2m entries */
+static mfn_t ept_get_entry(struct domain *d, unsigned long gfn, p2m_type_t *t)
+{
+ ept_entry_t *table =
+ map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
+ unsigned long gfn_remainder = gfn;
+ ept_entry_t *ept_entry;
+ u32 index;
+ int i;
+ mfn_t mfn = _mfn(INVALID_MFN);
+
+ *t = p2m_mmio_dm;
+
+ /* This pfn is higher than the highest the p2m map currently holds */
+ if ( gfn > d->arch.p2m->max_mapped_pfn )
+ goto out;
+
+ /* Should check if gfn obeys GAW here. */
+
+ for ( i = EPT_DEFAULT_GAW; i > 0; i-- )
+ if ( !ept_next_level(d, 1, &table, &gfn_remainder,
+ i * EPT_TABLE_ORDER) )
+ goto out;
+
+ index = gfn_remainder;
+ ept_entry = table + index;
+
+ if ( (ept_entry->epte & 0x7) == 0x7 )
+ {
+ if ( ept_entry->avail1 != p2m_invalid )
+ {
+ *t = ept_entry->avail1;
+ mfn = _mfn(ept_entry->mfn);
+ }
+ }
+
+ out:
+ unmap_domain_page(table);
+ return mfn;
+}
+
+static mfn_t ept_get_entry_current(unsigned long gfn, p2m_type_t *t)
+{
+ return ept_get_entry(current->domain, gfn, t);
+}
+
+void ept_p2m_init(struct domain *d)
+{
+ d->arch.p2m->set_entry = ept_set_entry;
+ d->arch.p2m->get_entry = ept_get_entry;
+ d->arch.p2m->get_entry_current = ept_get_entry_current;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
#include <asm/page.h>
#include <asm/paging.h>
#include <asm/p2m.h>
+#include <asm/hvm/vmx/vmx.h> /* ept_p2m_init() */
#include <xen/iommu.h>
/* Debugging and auditing of the P2M code? */
* Locking discipline: always acquire this lock before the shadow or HAP one
*/
-#define p2m_lock_init(_d) \
- do { \
- spin_lock_init(&(_d)->arch.p2m.lock); \
- (_d)->arch.p2m.locker = -1; \
- (_d)->arch.p2m.locker_function = "nobody"; \
+#define p2m_lock_init(_p2m) \
+ do { \
+ spin_lock_init(&(_p2m)->lock); \
+ (_p2m)->locker = -1; \
+ (_p2m)->locker_function = "nobody"; \
} while (0)
-#define p2m_lock(_d) \
- do { \
- if ( unlikely((_d)->arch.p2m.locker == current->processor) )\
- { \
- printk("Error: p2m lock held by %s\n", \
- (_d)->arch.p2m.locker_function); \
- BUG(); \
- } \
- spin_lock(&(_d)->arch.p2m.lock); \
- ASSERT((_d)->arch.p2m.locker == -1); \
- (_d)->arch.p2m.locker = current->processor; \
- (_d)->arch.p2m.locker_function = __func__; \
+#define p2m_lock(_p2m) \
+ do { \
+ if ( unlikely((_p2m)->locker == current->processor) ) \
+ { \
+ printk("Error: p2m lock held by %s\n", \
+ (_p2m)->locker_function); \
+ BUG(); \
+ } \
+ spin_lock(&(_p2m)->lock); \
+ ASSERT((_p2m)->locker == -1); \
+ (_p2m)->locker = current->processor; \
+ (_p2m)->locker_function = __func__; \
} while (0)
-#define p2m_unlock(_d) \
- do { \
- ASSERT((_d)->arch.p2m.locker == current->processor); \
- (_d)->arch.p2m.locker = -1; \
- (_d)->arch.p2m.locker_function = "nobody"; \
- spin_unlock(&(_d)->arch.p2m.lock); \
+#define p2m_unlock(_p2m) \
+ do { \
+ ASSERT((_p2m)->locker == current->processor); \
+ (_p2m)->locker = -1; \
+ (_p2m)->locker_function = "nobody"; \
+ spin_unlock(&(_p2m)->lock); \
} while (0)
-
/* Printouts */
#define P2M_PRINTK(_f, _a...) \
debugtrace_printk("p2m: %s(): " _f, __func__, ##_a)
l1_pgentry_t *p2m_entry;
l1_pgentry_t new_entry;
void *next;
- ASSERT(d->arch.p2m.alloc_page);
+ ASSERT(d->arch.p2m->alloc_page);
if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn,
shift, max)) )
if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) )
{
- struct page_info *pg = d->arch.p2m.alloc_page(d);
+ struct page_info *pg = d->arch.p2m->alloc_page(d);
if ( pg == NULL )
return 0;
- list_add_tail(&pg->list, &d->arch.p2m.pages);
+ list_add_tail(&pg->list, &d->arch.p2m->pages);
pg->u.inuse.type_info = type | 1 | PGT_validated;
pg->count_info = 1;
// Returns 0 on error (out of memory)
static int
-set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt)
+p2m_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt)
{
// XXX -- this might be able to be faster iff current->domain == d
mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table);
ASSERT(p2m_entry);
/* Track the highest gfn for which we have ever had a valid mapping */
- if ( mfn_valid(mfn) && (gfn > d->arch.p2m.max_mapped_pfn) )
- d->arch.p2m.max_mapped_pfn = gfn;
+ if ( mfn_valid(mfn) && (gfn > d->arch.p2m->max_mapped_pfn) )
+ d->arch.p2m->max_mapped_pfn = gfn;
if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) )
entry_content = l1e_from_pfn(mfn_x(mfn), p2m_type_to_flags(p2mt));
return rv;
}
+static mfn_t
+p2m_gfn_to_mfn(struct domain *d, unsigned long gfn, p2m_type_t *t)
+{
+ mfn_t mfn;
+ paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
+ l2_pgentry_t *l2e;
+ l1_pgentry_t *l1e;
+
+ ASSERT(paging_mode_translate(d));
+
+ /* XXX This is for compatibility with the old model, where anything not
+ * XXX marked as RAM was considered to be emulated MMIO space.
+ * XXX Once we start explicitly registering MMIO regions in the p2m
+ * XXX we will return p2m_invalid for unmapped gfns */
+ *t = p2m_mmio_dm;
+
+ mfn = pagetable_get_mfn(d->arch.phys_table);
+
+ if ( gfn > d->arch.p2m->max_mapped_pfn )
+ /* This pfn is higher than the highest the p2m map currently holds */
+ return _mfn(INVALID_MFN);
+
+#if CONFIG_PAGING_LEVELS >= 4
+ {
+ l4_pgentry_t *l4e = map_domain_page(mfn_x(mfn));
+ l4e += l4_table_offset(addr);
+ if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 )
+ {
+ unmap_domain_page(l4e);
+ return _mfn(INVALID_MFN);
+ }
+ mfn = _mfn(l4e_get_pfn(*l4e));
+ unmap_domain_page(l4e);
+ }
+#endif
+#if CONFIG_PAGING_LEVELS >= 3
+ {
+ l3_pgentry_t *l3e = map_domain_page(mfn_x(mfn));
+#if CONFIG_PAGING_LEVELS == 3
+ /* On PAE hosts the p2m has eight l3 entries, not four (see
+ * shadow_set_p2m_entry()) so we can't use l3_table_offset.
+ * Instead, just count the number of l3es from zero. It's safe
+ * to do this because we already checked that the gfn is within
+ * the bounds of the p2m. */
+ l3e += (addr >> L3_PAGETABLE_SHIFT);
+#else
+ l3e += l3_table_offset(addr);
+#endif
+ if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
+ {
+ unmap_domain_page(l3e);
+ return _mfn(INVALID_MFN);
+ }
+ mfn = _mfn(l3e_get_pfn(*l3e));
+ unmap_domain_page(l3e);
+ }
+#endif
+
+ l2e = map_domain_page(mfn_x(mfn));
+ l2e += l2_table_offset(addr);
+ if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
+ {
+ unmap_domain_page(l2e);
+ return _mfn(INVALID_MFN);
+ }
+ mfn = _mfn(l2e_get_pfn(*l2e));
+ unmap_domain_page(l2e);
+
+ l1e = map_domain_page(mfn_x(mfn));
+ l1e += l1_table_offset(addr);
+ if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
+ {
+ unmap_domain_page(l1e);
+ return _mfn(INVALID_MFN);
+ }
+ mfn = _mfn(l1e_get_pfn(*l1e));
+ *t = p2m_flags_to_type(l1e_get_flags(*l1e));
+ unmap_domain_page(l1e);
+
+ ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
+ return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN);
+}
+
+/* Read the current domain's p2m table (through the linear mapping). */
+static mfn_t p2m_gfn_to_mfn_current(unsigned long gfn, p2m_type_t *t)
+{
+ mfn_t mfn = _mfn(INVALID_MFN);
+ p2m_type_t p2mt = p2m_mmio_dm;
+ /* XXX This is for compatibility with the old model, where anything not
+ * XXX marked as RAM was considered to be emulated MMIO space.
+ * XXX Once we start explicitly registering MMIO regions in the p2m
+ * XXX we will return p2m_invalid for unmapped gfns */
+
+ if ( gfn <= current->domain->arch.p2m->max_mapped_pfn )
+ {
+ l1_pgentry_t l1e = l1e_empty();
+ int ret;
+
+ ASSERT(gfn < (RO_MPT_VIRT_END - RO_MPT_VIRT_START)
+ / sizeof(l1_pgentry_t));
+
+ /* Need to __copy_from_user because the p2m is sparse and this
+ * part might not exist */
+ ret = __copy_from_user(&l1e,
+ &phys_to_machine_mapping[gfn],
+ sizeof(l1e));
+
+ if ( ret == 0 ) {
+ p2mt = p2m_flags_to_type(l1e_get_flags(l1e));
+ ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt));
+ if ( p2m_is_valid(p2mt) )
+ mfn = _mfn(l1e_get_pfn(l1e));
+ else
+ /* XXX see above */
+ p2mt = p2m_mmio_dm;
+ }
+ }
+
+ *t = p2mt;
+ return mfn;
+}
/* Init the datastructures for later use by the p2m code */
-void p2m_init(struct domain *d)
+int p2m_init(struct domain *d)
{
- p2m_lock_init(d);
- INIT_LIST_HEAD(&d->arch.p2m.pages);
+ struct p2m_domain *p2m;
+
+ p2m = xmalloc(struct p2m_domain);
+ if ( p2m == NULL )
+ return -ENOMEM;
+
+ d->arch.p2m = p2m;
+
+ p2m_lock_init(p2m);
+ INIT_LIST_HEAD(&p2m->pages);
+
+ p2m->set_entry = p2m_set_entry;
+ p2m->get_entry = p2m_gfn_to_mfn;
+ p2m->get_entry_current = p2m_gfn_to_mfn_current;
+
+ if ( is_hvm_domain(d) && d->arch.hvm_domain.hap_enabled &&
+ (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) )
+ ept_p2m_init(d);
+
+ return 0;
}
+static inline
+int set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt)
+{
+ return d->arch.p2m->set_entry(d, gfn, mfn, p2mt);
+}
// Allocate a new p2m table for a domain.
//
struct page_info *page, *p2m_top;
unsigned int page_count = 0;
unsigned long gfn = -1UL;
+ struct p2m_domain *p2m = d->arch.p2m;
- p2m_lock(d);
+ p2m_lock(p2m);
if ( pagetable_get_pfn(d->arch.phys_table) != 0 )
{
P2M_ERROR("p2m already allocated for this domain\n");
- p2m_unlock(d);
+ p2m_unlock(p2m);
return -EINVAL;
}
P2M_PRINTK("allocating p2m table\n");
- d->arch.p2m.alloc_page = alloc_page;
- d->arch.p2m.free_page = free_page;
+ p2m->alloc_page = alloc_page;
+ p2m->free_page = free_page;
- p2m_top = d->arch.p2m.alloc_page(d);
+ p2m_top = p2m->alloc_page(d);
if ( p2m_top == NULL )
{
- p2m_unlock(d);
+ p2m_unlock(p2m);
return -ENOMEM;
}
- list_add_tail(&p2m_top->list, &d->arch.p2m.pages);
+ list_add_tail(&p2m_top->list, &p2m->pages);
p2m_top->count_info = 1;
p2m_top->u.inuse.type_info =
#endif
P2M_PRINTK("p2m table initialised (%u pages)\n", page_count);
- p2m_unlock(d);
+ p2m_unlock(p2m);
return 0;
error:
P2M_PRINTK("failed to initialize p2m table, gfn=%05lx, mfn=%"
PRI_mfn "\n", gfn, mfn_x(mfn));
- p2m_unlock(d);
+ p2m_unlock(p2m);
return -ENOMEM;
}
{
struct list_head *entry, *n;
struct page_info *pg;
+ struct p2m_domain *p2m = d->arch.p2m;
- p2m_lock(d);
+ p2m_lock(p2m);
d->arch.phys_table = pagetable_null();
- list_for_each_safe(entry, n, &d->arch.p2m.pages)
+ list_for_each_safe(entry, n, &p2m->pages)
{
pg = list_entry(entry, struct page_info, list);
list_del(entry);
- d->arch.p2m.free_page(d, pg);
+ p2m->free_page(d, pg);
}
- p2m_unlock(d);
+ p2m_unlock(p2m);
}
-mfn_t
-gfn_to_mfn_foreign(struct domain *d, unsigned long gfn, p2m_type_t *t)
-/* Read another domain's p2m entries */
+void p2m_final_teardown(struct domain *d)
{
- mfn_t mfn;
- paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
- l2_pgentry_t *l2e;
- l1_pgentry_t *l1e;
-
- ASSERT(paging_mode_translate(d));
-
- /* XXX This is for compatibility with the old model, where anything not
- * XXX marked as RAM was considered to be emulated MMIO space.
- * XXX Once we start explicitly registering MMIO regions in the p2m
- * XXX we will return p2m_invalid for unmapped gfns */
- *t = p2m_mmio_dm;
-
- mfn = pagetable_get_mfn(d->arch.phys_table);
-
- if ( gfn > d->arch.p2m.max_mapped_pfn )
- /* This pfn is higher than the highest the p2m map currently holds */
- return _mfn(INVALID_MFN);
-
-#if CONFIG_PAGING_LEVELS >= 4
- {
- l4_pgentry_t *l4e = map_domain_page(mfn_x(mfn));
- l4e += l4_table_offset(addr);
- if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 )
- {
- unmap_domain_page(l4e);
- return _mfn(INVALID_MFN);
- }
- mfn = _mfn(l4e_get_pfn(*l4e));
- unmap_domain_page(l4e);
- }
-#endif
-#if CONFIG_PAGING_LEVELS >= 3
- {
- l3_pgentry_t *l3e = map_domain_page(mfn_x(mfn));
-#if CONFIG_PAGING_LEVELS == 3
- /* On PAE hosts the p2m has eight l3 entries, not four (see
- * shadow_set_p2m_entry()) so we can't use l3_table_offset.
- * Instead, just count the number of l3es from zero. It's safe
- * to do this because we already checked that the gfn is within
- * the bounds of the p2m. */
- l3e += (addr >> L3_PAGETABLE_SHIFT);
-#else
- l3e += l3_table_offset(addr);
-#endif
- if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
- {
- unmap_domain_page(l3e);
- return _mfn(INVALID_MFN);
- }
- mfn = _mfn(l3e_get_pfn(*l3e));
- unmap_domain_page(l3e);
- }
-#endif
-
- l2e = map_domain_page(mfn_x(mfn));
- l2e += l2_table_offset(addr);
- if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
- {
- unmap_domain_page(l2e);
- return _mfn(INVALID_MFN);
- }
- mfn = _mfn(l2e_get_pfn(*l2e));
- unmap_domain_page(l2e);
-
- l1e = map_domain_page(mfn_x(mfn));
- l1e += l1_table_offset(addr);
- if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
- {
- unmap_domain_page(l1e);
- return _mfn(INVALID_MFN);
- }
- mfn = _mfn(l1e_get_pfn(*l1e));
- *t = p2m_flags_to_type(l1e_get_flags(*l1e));
- unmap_domain_page(l1e);
-
- ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
- return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN);
+ xfree(d->arch.p2m);
+ d->arch.p2m = NULL;
}
#if P2M_AUDIT
set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
}
- if ( test_linear && (gfn <= d->arch.p2m.max_mapped_pfn) )
+ if ( test_linear && (gfn <= d->arch.p2m->max_mapped_pfn) )
{
lp2mfn = mfn_x(gfn_to_mfn_current(gfn, &type));
if ( lp2mfn != mfn_x(p2mfn) )
guest_physmap_remove_page(struct domain *d, unsigned long gfn,
unsigned long mfn)
{
- p2m_lock(d);
+ p2m_lock(d->arch.p2m);
audit_p2m(d);
p2m_remove_page(d, gfn, mfn);
audit_p2m(d);
- p2m_unlock(d);
+ p2m_unlock(d->arch.p2m);
}
int
*/
if ( paging_mode_hap(d) && (gfn > 0xfffffUL) )
{
- if ( !test_and_set_bool(d->arch.hvm_domain.amd_npt_4gb_warning) )
+ if ( !test_and_set_bool(d->arch.hvm_domain.svm.npt_4gb_warning) )
dprintk(XENLOG_WARNING, "Dom%d failed to populate memory beyond"
" 4GB: specify 'hap=0' domain config option.\n",
d->domain_id);
}
#endif
- p2m_lock(d);
+ p2m_lock(d->arch.p2m);
audit_p2m(d);
P2M_DEBUG("adding gfn=%#lx mfn=%#lx\n", gfn, mfn);
}
audit_p2m(d);
- p2m_unlock(d);
+ p2m_unlock(d->arch.p2m);
return rc;
}
if ( pagetable_get_pfn(d->arch.phys_table) == 0 )
return;
- p2m_lock(d);
+ p2m_lock(d->arch.p2m);
#if CONFIG_PAGING_LEVELS == 4
l4e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
unmap_domain_page(l2e);
#endif
- p2m_unlock(d);
+ p2m_unlock(d->arch.p2m);
}
/* Modify the p2m type of a single gfn from ot to nt, returning the
p2m_type_t pt;
mfn_t mfn;
- p2m_lock(d);
+ p2m_lock(d->arch.p2m);
mfn = gfn_to_mfn(d, gfn, &pt);
if ( pt == ot )
set_p2m_entry(d, gfn, mfn, nt);
- p2m_unlock(d);
+ p2m_unlock(d->arch.p2m);
return pt;
}
/* CODE FOR PAGING SUPPORT */
/************************************************/
/* Domain paging struct initialization. */
-void paging_domain_init(struct domain *d)
+int paging_domain_init(struct domain *d)
{
- p2m_init(d);
+ int rc;
+
+ if ( (rc = p2m_init(d)) != 0 )
+ return rc;
/* The order of the *_init calls below is important, as the later
* ones may rewrite some common fields. Shadow pagetables are the
/* ... but we will use hardware assistance if it's available. */
if ( hap_enabled(d) )
hap_domain_init(d);
+
+ return 0;
}
/* vcpu paging struct initialization goes here */
hap_final_teardown(d);
else
shadow_final_teardown(d);
+
+ p2m_final_teardown(d);
}
/* Enable an arbitrary paging-assistance mode. Call once at domain
#include <public/domctl.h>
#include <xsm/xsm.h>
+DEFINE_SPINLOCK(domctl_lock);
+
extern long arch_do_domctl(
struct xen_domctl *op, XEN_GUEST_HANDLE(xen_domctl_t) u_domctl);
{
long ret = 0;
struct xen_domctl curop, *op = &curop;
- static DEFINE_SPINLOCK(domctl_lock);
if ( !IS_PRIV(current->domain) )
return -EPERM;
#include <xen/sched.h>
#include <xen/xmalloc.h>
#include <xen/domain_page.h>
+#include <asm/paging.h>
#include <xen/iommu.h>
#include <xen/numa.h>
#include "iommu.h"
}
p2m_table = mfn_x(pagetable_get_mfn(d->arch.phys_table));
-#if CONFIG_PAGING_LEVELS == 3
- if ( !hd->pgd )
+ if ( paging_mode_hap(d) )
{
+ int level = agaw_to_level(hd->agaw);
+ struct dma_pte *dpte = NULL;
+ mfn_t pgd_mfn;
+
+ switch ( level )
+ {
+ case VTD_PAGE_TABLE_LEVEL_3:
+ dpte = map_domain_page(p2m_table);
+ if ( !dma_pte_present(*dpte) )
+ {
+ gdprintk(XENLOG_ERR VTDPREFIX,
+ "iommu_set_pgd: second level wasn't there\n");
+ unmap_domain_page(dpte);
+ return;
+ }
+ pgd_mfn = _mfn(dma_pte_addr(*dpte) >> PAGE_SHIFT_4K);
+ unmap_domain_page(dpte);
+ hd->pgd = maddr_to_virt(pagetable_get_paddr(
+ pagetable_from_mfn(pgd_mfn)));
+ break;
+ case VTD_PAGE_TABLE_LEVEL_4:
+ pgd_mfn = _mfn(p2m_table);
+ hd->pgd = maddr_to_virt(pagetable_get_paddr(
+ pagetable_from_mfn(pgd_mfn)));
+ break;
+ default:
+ gdprintk(XENLOG_ERR VTDPREFIX,
+ "iommu_set_pgd:Unsupported p2m table sharing level!\n");
+ break;
+ }
+ }
+ else
+ {
+#if CONFIG_PAGING_LEVELS == 3
int level = agaw_to_level(hd->agaw);
struct dma_pte *pmd = NULL;
struct dma_pte *pgd = NULL;
}
unmap_domain_page(l3e);
spin_unlock_irqrestore(&hd->mapping_lock, flags);
- }
#elif CONFIG_PAGING_LEVELS == 4
- if ( !hd->pgd )
- {
int level = agaw_to_level(hd->agaw);
l3_pgentry_t *l3e;
mfn_t pgd_mfn;
"iommu_set_pgd:Unsupported p2m table sharing level!\n");
break;
}
- }
#endif
+ }
gdprintk(XENLOG_INFO VTDPREFIX,
"iommu_set_pgd: hd->pgd = %p\n", hd->pgd);
}
unsigned int p2m_pages; /* number of pages allocated to p2m */
};
-/************************************************/
-/* p2m handling */
-/************************************************/
-struct p2m_domain {
- /* Lock that protects updates to the p2m */
- spinlock_t lock;
- int locker; /* processor which holds the lock */
- const char *locker_function; /* Func that took it */
-
- /* Pages used to construct the p2m */
- struct list_head pages;
-
- /* Functions to call to get or free pages for the p2m */
- struct page_info * (*alloc_page )(struct domain *d);
- void (*free_page )(struct domain *d,
- struct page_info *pg);
-
- /* Highest guest frame that's ever been mapped in the p2m */
- unsigned long max_mapped_pfn;
-};
-
/************************************************/
/* common paging data structure */
/************************************************/
struct shadow_vcpu shadow;
};
+struct p2m_domain;
+
struct arch_domain
{
l1_pgentry_t *mm_perdomain_pt;
struct hvm_domain hvm_domain;
struct paging_domain paging;
- struct p2m_domain p2m ;
+ struct p2m_domain *p2m;
/* Shadow translated domain: P2M mapping */
pagetable_t phys_table;
#include <asm/hvm/vioapic.h>
#include <asm/hvm/io.h>
#include <xen/hvm/iommu.h>
+#include <asm/hvm/vmx/vmcs.h>
+#include <asm/hvm/svm/vmcb.h>
#include <public/hvm/params.h>
#include <public/hvm/save.h>
uint64_t params[HVM_NR_PARAMS];
- unsigned long vmx_apic_access_mfn;
-
/* Memory ranges with pinned cache attributes. */
struct list_head pinned_cacheattr_ranges;
/* Pass-through */
struct hvm_iommu hvm_iommu;
-#if CONFIG_PAGING_LEVELS == 3
- bool_t amd_npt_4gb_warning;
-#endif
bool_t hap_enabled;
bool_t qemu_mapcache_invalidate;
+
+ union {
+ struct vmx_domain vmx;
+ struct svm_domain svm;
+ };
};
#endif /* __ASM_X86_HVM_DOMAIN_H__ */
u64 res16[301];
} __attribute__ ((packed));
+struct svm_domain {
+#if CONFIG_PAGING_LEVELS == 3
+ bool_t npt_4gb_warning;
+#endif
+};
+
struct arch_svm_struct {
struct vmcb_struct *vmcb;
u64 vmcb_pa;
unsigned long msrs[VMX_MSR_COUNT];
};
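+/* EPTP defaults: memory type 6 is write-back; GAW 3 selects a 4-level walk. */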
+#define EPT_DEFAULT_MT 6
+#define EPT_DEFAULT_GAW 3
+
+struct vmx_domain {
+ unsigned long apic_access_mfn;
+
+ union {
+ struct {
+ u64 etmt :3,
+ gaw :3,
+ rsvd :6,
+ asr :52;
+ };
+ u64 eptp;
+ } ept_control;
+};
+
struct arch_vmx_struct {
/* Virtual address of VMCS. */
struct vmcs_struct *vmcs;
/* Cache of cpu execution control. */
u32 exec_control;
+ u32 secondary_exec_control;
/* PMU */
struct vpmu_struct vpmu;
#define CPU_BASED_MWAIT_EXITING 0x00000400
#define CPU_BASED_RDPMC_EXITING 0x00000800
#define CPU_BASED_RDTSC_EXITING 0x00001000
+#define CPU_BASED_CR3_LOAD_EXITING 0x00008000
+#define CPU_BASED_CR3_STORE_EXITING 0x00010000
#define CPU_BASED_CR8_LOAD_EXITING 0x00080000
#define CPU_BASED_CR8_STORE_EXITING 0x00100000
#define CPU_BASED_TPR_SHADOW 0x00200000
extern u32 vmx_vmentry_control;
#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
+#define SECONDARY_EXEC_ENABLE_EPT 0x00000002
#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
extern u32 vmx_secondary_exec_control;
(vmx_pin_based_exec_control & PIN_BASED_VIRTUAL_NMIS)
#define cpu_has_vmx_msr_bitmap \
(vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_MSR_BITMAP)
+#define cpu_has_vmx_secondary_exec_control \
+ (vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
+#define cpu_has_vmx_ept \
+ (vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)
/* GUEST_INTERRUPTIBILITY_INFO flags. */
#define VMX_INTR_SHADOW_STI 0x00000001
VIRTUAL_APIC_PAGE_ADDR = 0x00002012,
VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013,
APIC_ACCESS_ADDR = 0x00002014,
- APIC_ACCESS_ADDR_HIGH = 0x00002015,
+ APIC_ACCESS_ADDR_HIGH = 0x00002015,
+ EPT_POINTER = 0x0000201a,
+ EPT_POINTER_HIGH = 0x0000201b,
+ GUEST_PHYSICAL_ADDRESS = 0x00002400,
+ GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401,
VMCS_LINK_POINTER = 0x00002800,
VMCS_LINK_POINTER_HIGH = 0x00002801,
GUEST_IA32_DEBUGCTL = 0x00002802,
GUEST_IA32_DEBUGCTL_HIGH = 0x00002803,
+ GUEST_PDPTR0 = 0x0000280a,
+ GUEST_PDPTR0_HIGH = 0x0000280b,
+ GUEST_PDPTR1 = 0x0000280c,
+ GUEST_PDPTR1_HIGH = 0x0000280d,
+ GUEST_PDPTR2 = 0x0000280e,
+ GUEST_PDPTR2_HIGH = 0x0000280f,
+ GUEST_PDPTR3 = 0x00002810,
+ GUEST_PDPTR3_HIGH = 0x00002811,
PIN_BASED_VM_EXEC_CONTROL = 0x00004000,
CPU_BASED_VM_EXEC_CONTROL = 0x00004002,
EXCEPTION_BITMAP = 0x00004004,
#include <asm/types.h>
#include <asm/regs.h>
#include <asm/processor.h>
-#include <asm/hvm/vmx/vmcs.h>
#include <asm/i387.h>
+#include <asm/hvm/support.h>
#include <asm/hvm/trace.h>
+#include <asm/hvm/vmx/vmcs.h>
+
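+/*
+ * EPT page-table entry: r/w/x are the access permissions, emt the EPT
+ * memory type; the avail1 field is used by the p2m code to record the
+ * p2m type of the mapping.
+ */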
+typedef union {
+ struct {
+ u64 r : 1,
+ w : 1,
+ x : 1,
+ emt : 4,
+ sp_avail : 1,
+ avail1 : 4,
+ mfn : 45,
+ rsvd : 5,
+ avail2 : 2;
+ };
+ u64 epte;
+} ept_entry_t;
+
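+/* Each EPT table page holds 2^9 = 512 eight-byte entries. */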
+#define EPT_TABLE_ORDER 9
void vmx_asm_vmexit_handler(struct cpu_user_regs);
void vmx_asm_do_vmentry(void);
#define EXIT_REASON_MACHINE_CHECK 41
#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
#define EXIT_REASON_APIC_ACCESS 44
+#define EXIT_REASON_EPT_VIOLATION 48
+#define EXIT_REASON_EPT_MISCONFIG 49
#define EXIT_REASON_WBINVD 54
/*
#define VMREAD_OPCODE ".byte 0x0f,0x78\n"
#define VMRESUME_OPCODE ".byte 0x0f,0x01,0xc3\n"
#define VMWRITE_OPCODE ".byte 0x0f,0x79\n"
+#define INVEPT_OPCODE ".byte 0x66,0x0f,0x38,0x80\n" /* m128,r64/32 */
#define VMXOFF_OPCODE ".byte 0x0f,0x01,0xc4\n"
#define VMXON_OPCODE ".byte 0xf3,0x0f,0xc7\n"
+#define MODRM_EAX_08 ".byte 0x08\n" /* ECX, [EAX] */
#define MODRM_EAX_06 ".byte 0x30\n" /* [EAX], with reg/opcode: /6 */
#define MODRM_EAX_07 ".byte 0x38\n" /* [EAX], with reg/opcode: /7 */
-#define MODRM_EAX_ECX ".byte 0xc1\n" /* [EAX], [ECX] */
+#define MODRM_EAX_ECX ".byte 0xc1\n" /* EAX, ECX */
static inline void __vmptrld(u64 addr)
{
__vmwrite(field, __vmread(field) & ~(1UL << bit));
}
+static inline void __invept(int ext, u64 eptp, u64 gpa)
+{
+ struct {
+ u64 eptp, gpa;
+ } operand = {eptp, gpa};
+
+ __asm__ __volatile__ ( INVEPT_OPCODE
+ MODRM_EAX_08
+ /* CF==1 or ZF==1 --> INVEPT failed; crash via ud2. */
+ "ja 1f ; ud2 ; 1:\n"
+ :
+ : "a" (&operand), "c" (ext)
+ : "memory");
+}
+
+static inline void ept_sync_all(void)
+{
+ if ( !current->domain->arch.hvm_domain.hap_enabled )
+ return;
+
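+ /* INVEPT type 2: invalidate mappings for all EPT contexts. */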
+ __invept(2, 0, 0);
+}
+
+void ept_sync_domain(struct domain *d);
+
static inline void __vmxoff(void)
{
asm volatile (
void vmx_inject_extint(struct vcpu *v, int trap);
void vmx_inject_nmi(struct vcpu *v);
+void ept_p2m_init(struct domain *d);
+
#endif /* __ASM_X86_HVM_VMX_VMX_H__ */
#ifndef _XEN_P2M_H
#define _XEN_P2M_H
+#include <xen/config.h>
+#include <xen/paging.h>
/*
* The phys_to_machine_mapping maps guest physical frame numbers
#define p2m_is_readonly(_t) (p2m_to_mask(_t) & P2M_RO_TYPES)
#define p2m_is_valid(_t) (p2m_to_mask(_t) & (P2M_RAM_TYPES | P2M_MMIO_TYPES))
+struct p2m_domain {
+ /* Lock that protects updates to the p2m */
+ spinlock_t lock;
+ int locker; /* processor which holds the lock */
+ const char *locker_function; /* Func that took it */
+
+ /* Pages used to construct the p2m */
+ struct list_head pages;
+
+ /* Functions to call to get or free pages for the p2m */
+ struct page_info * (*alloc_page )(struct domain *d);
+ void (*free_page )(struct domain *d,
+ struct page_info *pg);
+ int (*set_entry )(struct domain *d, unsigned long gfn,
+ mfn_t mfn, p2m_type_t p2mt);
+ mfn_t (*get_entry )(struct domain *d, unsigned long gfn,
+ p2m_type_t *p2mt);
+ mfn_t (*get_entry_current)(unsigned long gfn,
+ p2m_type_t *p2mt);
+
+ /* Highest guest frame that's ever been mapped in the p2m */
+ unsigned long max_mapped_pfn;
+};
+
/* Extract the type from the PTE flags that store it */
static inline p2m_type_t p2m_flags_to_type(unsigned long flags)
{
/* Type is stored in the "available" bits, 9, 10 and 11 */
return (flags >> 9) & 0x7;
}
-
-/* Read the current domain's p2m table (through the linear mapping). */
+
+/* Read the current domain's p2m table. */
static inline mfn_t gfn_to_mfn_current(unsigned long gfn, p2m_type_t *t)
{
- mfn_t mfn = _mfn(INVALID_MFN);
- p2m_type_t p2mt = p2m_mmio_dm;
- /* XXX This is for compatibility with the old model, where anything not
- * XXX marked as RAM was considered to be emulated MMIO space.
- * XXX Once we start explicitly registering MMIO regions in the p2m
- * XXX we will return p2m_invalid for unmapped gfns */
-
- if ( gfn <= current->domain->arch.p2m.max_mapped_pfn )
- {
- l1_pgentry_t l1e = l1e_empty();
- int ret;
-
- ASSERT(gfn < (RO_MPT_VIRT_END - RO_MPT_VIRT_START)
- / sizeof(l1_pgentry_t));
-
- /* Need to __copy_from_user because the p2m is sparse and this
- * part might not exist */
- ret = __copy_from_user(&l1e,
- &phys_to_machine_mapping[gfn],
- sizeof(l1e));
-
- if ( ret == 0 ) {
- p2mt = p2m_flags_to_type(l1e_get_flags(l1e));
- ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt));
- if ( p2m_is_valid(p2mt) )
- mfn = _mfn(l1e_get_pfn(l1e));
- else
- /* XXX see above */
- p2mt = p2m_mmio_dm;
- }
- }
-
- *t = p2mt;
- return mfn;
+ return current->domain->arch.p2m->get_entry_current(gfn, t);
}
/* Read another domain's P2M table, mapping pages as we go */
-mfn_t gfn_to_mfn_foreign(struct domain *d, unsigned long gfn, p2m_type_t *t);
+static inline
+mfn_t gfn_to_mfn_foreign(struct domain *d, unsigned long gfn, p2m_type_t *t)
+{
+ return d->arch.p2m->get_entry(d, gfn, t);
+}
/* General conversion function from gfn to mfn */
#define gfn_to_mfn(d, g, t) _gfn_to_mfn((d), (g), (t))
}
if ( likely(current->domain == d) )
return gfn_to_mfn_current(gfn, t);
- else
+ else
return gfn_to_mfn_foreign(d, gfn, t);
}
/* Init the datastructures for later use by the p2m code */
-void p2m_init(struct domain *d);
+int p2m_init(struct domain *d);
/* Allocate a new p2m table for a domain.
*
/* Return all the p2m resources to Xen. */
void p2m_teardown(struct domain *d);
+void p2m_final_teardown(struct domain *d);
/* Add a page to a domain's p2m table */
int guest_physmap_add_entry(struct domain *d, unsigned long gfn,
/* Set up the paging-assistance-specific parts of a domain struct at
* start of day. Called for every domain from arch_domain_create() */
-void paging_domain_init(struct domain *d);
+int paging_domain_init(struct domain *d);
/* Handler for paging-control ops: operations from user-space to enable
* and disable ephemeral shadow modes (test mode and log-dirty mode) and
/* Boolean: Enable virtual HPET (high-precision event timer)? (x86-only) */
#define HVM_PARAM_HPET_ENABLED 11
+#define HVM_PARAM_IDENT_PT 12
-#define HVM_NR_PARAMS 12
+#define HVM_NR_PARAMS 13
#endif /* __XEN_PUBLIC_HVM_PARAMS_H__ */
int cmd,
XEN_GUEST_HANDLE(void) arg);
+extern spinlock_t domctl_lock;
extern long
do_domctl(
XEN_GUEST_HANDLE(xen_domctl_t) u_domctl);